In [1]:
import pandas as pd
%pylab inline
In [2]:
df = pd.read_csv("weather.csv", header=0, index_col=0)
df
Out[2]:
In [3]:
mean_temp = df["temperature"].mean()
mean_temp
Out[3]:
In [4]:
mean_humidity = df["humidity"].mean()
mean_humidity
Out[4]:
In [5]:
temp_selector = df['temperature'] > mean_temp
df[temp_selector][["outlook", "play"]]
Out[5]:
In [6]:
humidity_selector = df['humidity'] > mean_humidity
df[humidity_selector][["outlook", "play"]]
Out[6]:
In [7]:
df["temp_C"] = ( df["temperature"] - 32 ) * (5/9.0)
df
Out[7]:
In [8]:
play_selector = df["play"]=="yes"
play_days = df[play_selector]
len(play_days)
Out[8]:
In [9]:
sunny_selector = df["outlook"]=="sunny"
sunny_play_days = df[sunny_selector & play_selector]
len(sunny_play_days)
Out[9]:
In [10]:
print play_days["temperature"].mean()
print play_days["temperature"].min()
print play_days["temperature"].max()
In [11]:
print play_days["humidity"].mean()
print play_days["humidity"].min()
print play_days["humidity"].max()
In [20]:
pyplot.ylabel('Temperature')
pyplot.xlabel("Humidity")
pyplot.scatter(x=play_days["humidity"], y=play_days["temperature"], c='green')
no_play_days = df[df["play"]=="no"]
pyplot.scatter(x=no_play_days["humidity"], y=no_play_days["temperature"], c='red', marker="x")
pyplot.legend(['Play', "No Play"])
Out[20]:
The only inference I can make from the scatter plot above is that you always play when the humidity is between 70 and 85. Temperature seems to play no part in the decision to go out and play: in the scatter plot, the play and no-play points are evenly distributed along the y-axis (Temperature).
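As a rough check on that reading, we can compare the humidity spread for play days and no-play days directly (a minimal sketch, reusing the play_days and no_play_days frames defined above):
# humidity spread for play vs. no-play days (reuses play_days / no_play_days from above)
print(play_days["humidity"].describe())
print(no_play_days["humidity"].describe())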
We have a set of nine files, where the first seven and the last two use different formats. First I removed the header information and any superfluous line breaks from the files, then read them into pandas in two groups. I then had to normalize the dates of the second dataset to match the dates of the first, and I had to rescale the values of the first dataset because they were in units of thousands rather than units of one.
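The header stripping itself was done outside the notebook; below is a minimal sketch of that step, assuming the raw files live in a raw/ directory, the cleaned copies go to processed/, and each file starts with three header lines (both the paths and the line count are assumptions).
# Hypothetical pre-processing sketch: strip the assumed header block and blank
# lines from each raw census file, writing cleaned copies into processed/.
import glob, os
for path in glob.glob("raw/st*ts.txt"):  # assumed location of the raw files
    with open(path) as f:
        lines = f.readlines()[3:]  # drop the assumed three header lines
    lines = [line for line in lines if line.strip()]  # remove superfluous blank lines
    with open(os.path.join("processed", os.path.basename(path)), "w") as f:
        f.writelines(lines)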
In [37]:
# these files report populations in units of thousands; we rescale them below
df1 = pd.read_fwf("processed/st0009ts.txt", header=0, index_col=0, thousands=",").transpose()
df2 = pd.read_fwf("processed/st1019ts.txt", header=0, index_col=0, thousands=",").transpose()
df3 = pd.read_fwf("processed/st2029ts.txt", header=0, index_col=0, thousands=",").transpose()
df4 = pd.read_fwf("processed/st3039ts.txt", header=0, index_col=0, thousands=",").transpose()
df5 = pd.read_fwf("processed/st4049ts.txt", header=0, index_col=0, thousands=",").transpose()
df6 = pd.read_fwf("processed/st5060ts.txt", header=0, index_col=0, thousands=",").transpose()
df7 = pd.read_fwf("processed/st6070ts.txt", header=0, index_col=0, thousands=",").transpose()
df = pd.concat([df1, df2, df3, df4, df5, df6, df7])
# scale up from thousands to units of one
df = df * 1000
# this dataset format writes the national total as "U.S." (with periods) but uses plain
# abbreviations for everything else; check the column exists, then rename it to "US"
df[["U.S."]]
df.rename(columns={'U.S.': 'US'}, inplace=True)
In [38]:
# the file format changes here: remap the index labels to four-digit years
transform = lambda x: "19" + x[2:4]
df_9 = pd.read_fwf("processed/st7080ts.txt", header=0, index_col=0, thousands=",").transpose()
df_9.index = df_9.index.map(transform)
df_10 = pd.read_fwf("processed/st8090ts.txt", header=0, index_col=0, thousands=",").transpose()
df_10.index = df_10.index.map(transform)
df_10
df_2 = pd.concat([df_9, df_10])
In [39]:
# now concatenate the two groups to get the complete merged DataFrame
df = pd.concat([df, df_2])
df = df.sort_index()  # sort by year
In [40]:
df[["CA", "AK"]].plot()
Out[40]:
In [41]:
df["New England"] = df[["CT", "ME", "MA", "NH", "RI", "VT"]].sum(axis=1)
df["Southwest"] = df[["AZ", "CA", "CO", "NV", "NM", "TX", "UT"]].sum(axis=1)
In [42]:
df[["New England", "Southwest"]].plot()
Out[42]:
We can quantify population growth in direct terms or relatively, as percentages:
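As a minimal sketch, both measures can also be computed in one vectorized step using the merged df from above (before the composite columns are dropped in the next cell); the loop further below does the same thing state by state:
# vectorized sketch: absolute and percentage growth between row 50 and the last row
start = df.iloc[50].astype(float)
end = df.iloc[-1].astype(float)
direct_growth = end - start                      # change in number of people
relative_growth = (end - start) / start * 100    # change as a percentage of the start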
In [43]:
#remove a few composite columns:
df.drop('US', axis=1, inplace=True)
df.drop('Southwest', axis=1, inplace=True)
df.drop('New England', axis=1, inplace=True)
In [44]:
delta = {}
rel_delta = {}
# absolute and percentage growth from row 50 of the sorted index to the latest year
for state in df.columns:
    delta[state] = df[state].iloc[-1] - df[state].iloc[50]
    # force float division so the percentage is not truncated by integer division
    rel_delta[state] = (df[state].iloc[-1] - df[state].iloc[50]) / float(df[state].iloc[50]) * 100
ddf = pd.DataFrame(delta, index=["delta"]).transpose()
ddf = ddf.sort(["delta"], ascending=False)
ddf.head()
Out[44]:
As you can see from the table above, CA had the largest growth in raw numbers over this period. However, we can gain additional insight by looking at percentage growth.
In [56]:
ddp = pd.DataFrame(rel_delta, index=["% change"]).transpose()
ddp = ddp.sort(["% change"], ascending=False)
ddp.head()
Out[56]:
Some states had no net growth and some had negative growth:
In [61]:
ddp.tail(n=10)
Out[61]:
In [87]:
from sklearn import tree
import numpy as np
In [81]:
wine = np.loadtxt("wine.data", delimiter=',')
#Get the targets (first column of file)
Y = wine[:, 0]
#Remove targets from input data
X = wine[:, 1:]
In [102]:
# let's split the data into training and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=9)
In [110]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
In [111]:
clf.score(X_test, Y_test)
Out[111]:
In [112]:
import matplotlib.pyplot as plt
%matplotlib inline
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(3)
    plt.xticks(tick_marks, ["1", "2", "3"], rotation=45)
    plt.yticks(tick_marks, ["1", "2", "3"])
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [113]:
# Compute confusion matrix
from sklearn.metrics import confusion_matrix
y_true = Y_test
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
plt.show()
As you can see from the confusion matrix, inputs of Class 1 & 2 were perfectly classified. There were only 2 mistakes on Class 3.
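Per-class precision and recall tell the same story numerically; a quick sketch using scikit-learn's classification_report on the same predictions (reuses y_true and y_pred from the cell above):
# per-class precision/recall/F1 for the same test predictions
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))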
In [165]:
odf = pd.read_csv("hmwk_1_data/AHW_1.csv")
odf.head()
Out[165]:
In [174]:
odf["Age"].plot(kind="hist")
odf["Age"].describe()
Out[174]:
In [175]:
odf["Weight"].plot(kind="hist")
odf["Weight"].describe()
Out[175]:
In [176]:
odf["Height"].plot(kind="hist")
odf["Height"].describe()
Out[176]:
In [164]:
odf.isnull().sum()
Out[164]:
In [167]:
male = odf["Sex"]=="M"
female = odf["Sex"]=="F"
odf[male]["Age"].plot(kind="hist")
odf[female]["Age"].plot(kind="hist")
Out[167]:
In [170]:
odf[male]["Weight"].plot(kind="hist")
odf[female]["Weight"].plot(kind="hist")
Out[170]:
In [172]:
odf[male]["Height"].plot(kind="hist")
odf[female]["Height"].plot(kind="hist")
Out[172]:
In [177]:
odf.describe()
Out[177]:
We can see a correlation between height and weight.
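To put a number on that, here is a quick check of the Pearson correlation coefficient (a small sketch using the odf frame from above; .corr() ignores missing values by default):
# Pearson correlation between height and weight
print(odf[["Height", "Weight"]].corr())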
In [229]:
from pandas.tools.plotting import scatter_matrix
pd.scatter_matrix(odf, alpha=0.2, figsize=(10, 10), diagonal='kde')
Out[229]:
In [232]:
odf["lbs"] = odf["Weight"] * 2.20462
odf.head()
Out[232]:
In [234]:
pd.scatter_matrix(odf, alpha=0.2, figsize=(10, 10), diagonal='kde')
Out[234]:
In [238]:
odf["w+h"] = odf["Weight"] + odf["Height"]
odf.drop('lbs', axis=1, inplace=True)
odf.head()
In [239]:
pd.scatter_matrix(odf, alpha=0.2, figsize=(10, 10), diagonal='kde')
Out[239]:
In [247]:
odf["BMI"] = odf["Weight"] / ((odf["Height"]*0.01)**2)
odf.head()
Out[247]:
In [269]:
odf[male]["BMI"].plot(kind="hist")
odf[female]["BMI"].plot(kind="hist")
print odf[male]["BMI"].describe()
print
print odf[female]["BMI"].describe()
In [291]:
sports = list(set(odf["Sport"]))
sports
# choose 3 random sports
sports
Out[291]:
In [290]:
import random
# choose 3 random sports and plot Height vs. Weight for each
random_sports = random.sample(sports, 3)
for sport in random_sports:
    sport_selector = odf["Sport"] == sport
    odf[sport_selector].plot(kind="scatter", x="Height", y="Weight", marker='x')